kernel_path =  (File.expand_path(__FILE__).split("/"))[0..-2].join("/") + '/'

require kernel_path + 'log_lib'
require 'rubygems'
$: << '/home/rsyncuser/.gem/ruby/1.9.0/gems/bio-1.4.0/lib/'
require 'bio' unless $in_intel
#parsing pdb files lib

RESOLUTION_THRESHOLD = 3.5
RESOLUTION_HIGH = "[w][1] Resolution too high "
RESOLUTION_FAILED = "[e][1] Couldn't get resolution "
PDB_FAILED = "[e][2] get_PDB_structure"
PDB_FAILED1 = "[e][21] get_PDB_structure[nil]"
NOT_XRAY = "[w][2] Not X-RAY "

#get FILE io handle for filename - knows how to handle gzipped files
def get_io_handle ( filename )
	return IO.popen("zcat #{filename}") if filename.include? ".gz" #gzipped
	return IO.popen("cat #{filename} ") #File.new(filename,"r")
end
#-------------------------------------------------------------------
def get_PDB_structure(filename)
	
	#require 'bio/db/pdb.rb'
	file = get_io_handle filename
	str = []
	#read file to str
	while line = file.gets do 
		str << line
	end
	file.close
	
	structure = Bio::PDB.new(str.join(""))
	str = nil
	#sanity checks:
	raise "#{PDB_FAILED}(#{filename}) - parsing failed" if !structure 
	
	typeP = structure.record("EXPDTA").first.technique
	raise "#{NOT_XRAY} #{filename} - #{typeP}" if !(typeP.to_s.include?("RAY"))
	
	resolution = 42
	begin
	    resolution =structure.remark(2)[0].comment.split()[0].to_f 
	rescue => err
		raise "#{RESOLUTION_FAILED} for #{filename} because: #{err}"
	end
	raise "#{RESOLUTION_HIGH} (#{resolution}) for #{filename}" if RESOLUTION_THRESHOLD < resolution
 
	return structure
end
#-------------------------------------------------------------------
#note that when have no CA in residue we ignore it
#when we have several CAs wer take the first one
#returns hash where key is chain name and values are seq.strings
def get_seqres_from_atoms_sections(filename,ptr_precision=nil, structure=nil)
	struct = get_PDB_structure(filename)
	result = {}
	seq = []
	raise "Can't pdb parse #{filename}" if !struct 
	puts struct.models.size if struct.models.size != 1
struct.each do |model|	
	
	model.each { |chain|
		chain.each { |residue|
			if residue['CA']
				g = residue['CA'].resName
				seq << Bio::AminoAcid.three2one((g.upcase)[0..0] + (g.downcase)[1..-1])
			end
		}
		result[chain.id.to_s] = seq.join('') unless chain.id.to_s == 'W'#ask Rachel what is w
		seq = []
	}
end	
#(struct.remark(2))[0]['resolution'].to_f
	ptr_precision << struct.remark(2)[0].comment.split()[0].to_f  if ptr_precision
	structure << struct if structure
	return result
end
#-------------------------------------------------------------------
def get_CAs_from_structure(filename, struct)
	raise "[e][3] get_CAs_from_structure #{filename}" if !struct 
	result = {}
	seq = []
	struct[nil].each { |chain|
		chain.each { |residue|
			if residue['CA']
				g = residue['CA'].xyz
				seq << "#{g[0]} #{g[1]} #{g[2]}"
			end
		}
		result[chain.id.to_s] = seq.join("\n") unless chain.id.to_s == 'W'#ask Rachel what is w
		seq = []
	}
	return result
end
#-------------------------------------------------------------------
#note that when have no CA in residue we ignore it
#when we have several CAs wer take the first one
#returns hash where key is chain name and values are xyzs
def get_CAs_from_atoms_sections(filename)
	struct = get_PDB_structure(filename)
	return get_CAs_from_structure(filename, struct)
end
#-------------------------------------------------------------------
def save_CAs_from_structure(fname,suffix,structure)
		hsh = get_CAs_from_structure(fname, structure)
		hsh.keys.each do |k|
			next if (k.chomp.strip.size) < 1
			new_fname = "#{fname}#{k}#{suffix}".gsub('.ent','').sub('/pdb','/')

			f = File.new(new_fname,'w')
			f.puts hsh[k]
			f.flush
			f.close
		end
end
#-------------------------------------------------------------------
def save_CAs_from_atoms_sections(fname,suffix)
		hsh = get_CAs_from_atoms_sections(fname)
		hsh.keys.each do |k|
			next if (k.chomp.strip.size) < 1
			new_fname = "#{fname}#{k}#{suffix}".gsub('.ent','').sub('/pdb','/')
			f = File.new(new_fname,'w')
			f.puts hsh[k]
			f.flush
			f.close
		end
end
#-------------------------------------------------------------------
def load_rxyz(filename)
	if Dir[filename].empty?
		path = filename.split("/")[0..-2].join("/")
		name = "pdb" + filename.split("/")[-1].split(".")[0][0..-2] + ".ent"
		suffix = filename.split("/")[-1].split(".")[1]
		#puts "need #{path}/#{name} to create #{suffix}"
		save_CAs_from_atoms_sections("#{path}/#{name}","." +suffix)
	end
	return %x[cat #{filename}].split("\n")
end
#-------------------------------------------------------------------
def create_rxyz_files(directory_mask,suffix='.rxyz')
	counter = 0
	list = Dir[directory_mask.split("/")[0..-2].join("/") + '/*' + suffix]
	require 'set'
	set = list.to_set
	
	Dir[directory_mask].each do |fname|
		puts counter if (counter+=1)%1024 == 1
		new_fname = "#{fname}A#{suffix}".gsub('.ent','').sub('/pdb','/')
		next if set.include? new_fname
		save_CAs_from_atoms_sections(fname,suffix)
	end
end
#-------------------------------------------------------------------
def create_chain_files(directory_mask,suffix='.seq')
	counter = 0
	list = Dir[directory_mask.split("/")[0..-2].join("/") + '/*' + suffix]
	require 'set'
	set = list.to_set
	
	Dir[directory_mask].each do |fname|
		 
		puts counter if (counter+=1)%1024 == 1
		new_fname = "#{fname}A#{suffix}".gsub('.ent','').sub('/pdb','/')
		next if set.include? new_fname
		puts new_fname
		hsh = get_seqres_from_atoms_sections(fname)
		hsh.keys.each do |k|
			next if (k.chomp.strip.size) < 1
			new_fname = "#{fname}#{k}#{suffix}".gsub('.ent','').sub('/pdb','/')
			next if set.include? new_fname
			
			f = File.new(new_fname,'w')
			f.puts ">#{fname.split("/")[-1]}#{k}".gsub('.ent','').sub('pdb','')
			f.puts hsh[k]
			f.flush
			f.close
		end
	end
end
#-------------------------------------------------------------------
def get_xyz_from_single_chain(chain, filename,chain_name)
	xyz = []
	#redisues (of atom section) iterator
	chain.each { |r|
		if r['CA'] 
			g = r['CA'].resName
			letter = Bio::AminoAcid.three2one((g.upcase)[0..0] + (g.downcase)[1..-1])
			coords = r['CA'].xyz
			if ! letter
				warning_log "3_2_1 failed for #{g} in #{filename}:#{chain_name}" if g != 'UNK'
				letter = 'X'
			end
			xyz << [ coords, letter.to_s]
			
		end
	}
	
	return xyz
end
#-------------------------------------------------------------------
#note! we need xyz's of ca's only. but! some res. have more than 1 ca - we are taking the first one
def get_xyz_for_chain( filename, chain_name )
	structure = get_PDB_structure(filename)
	if !structure or !structure[nil] or !structure[nil][chain_name]
		err_log "parsing chain error in #{filename}:#{chain_name}!" 
		return nil
	end
	
	chain = structure[nil][chain_name]
	xyz = get_xyz_from_single_chain(chain, filename,chain_name)
	structure = nil
	return xyz
end
#-------------------------------------------------------------------
def print_xyz_to_file(xyz,filename,chain_name,offset=0)
	return if xyz.size < 4

	f = File.new(filename + "." + chain_name,"w")
	xyz.each do |pair|
		f.puts "#{pair[0][0]},#{pair[0][1]},#{pair[0][2]}: #{pair[1]}"
	end	
	f.close		
end
#-------------------------------------------------------------------
def print_crds_to_file(xyz,filename)
	return if xyz.size < 4

	f = File.new(filename,"w")
	xyz.each do |pair|
		if pair
			f.puts "#{pair[0]} #{pair[1]} #{pair[2]}"
		else
			f.puts "-"
		end
	end	
	f.close		
end
#-------------------------------------------------------------------
def get_xyz_for_all_chains(filename)
	xyz = []
	offset = nil
	structure = get_PDB_structure(filename)
	if !structure or !structure[nil] 
		err_log "parsing chain error in #{filename}!" 
		return nil
	end
	structure[nil].each do |chain|
		xyz = get_xyz_from_single_chain(chain, filename, chain.chain_id.to_s)
		seq = ''
		inflated = needle(seq,xyz[1],xyz[0],nil)
		print_xyz_to_file(xyz, filename, chain.chain_id) if xyz and chain.chain_id.to_s.size > 0
	end
	
end
#---------------------------------------------------
def needle(sequence, reference, data=reference, fill_with = '-')
	raise "wrong sized input within Needleman-Wunsch" if reference.size != data.size
	gap = -1
	rows = reference.length + 1
	cols = sequence.length + 1
	a = Array.new(cols){Array.new(rows)} 
	for i in 0...(rows) do a[i][0] = 0 end
	for j in 0...(cols) do a[0][j] = 0 end

	for i in 1...(rows)
		for j in 1...(cols)
			mismatch_pay = (sequence[j-1]==reference[i-1])? 0 : -1
			choice1 = a[i-1][j-1] + mismatch_pay 
			choice2 = a[i-1][j] + gap
			choice3 = a[i][j-1] + gap
			a[i][j] = [choice1, choice2, choice3].max
		end
	end
			
	ref = []
	
	i = reference.length 
	j = sequence.length
	while (i > 0 and j > 0)
		score = a[i][j]
		score_diag = a[i-1][j-1]
		score_up = a[i][j-1]
		score_left = a[i-1][j]
		mismatch_pay = (sequence[j-1]==reference[i-1])? 0 : -1
		if (score == score_diag + mismatch_pay)
			ref = [data[i-1]] + ref
			#seq = sequence[j-1] + seq
			i -= 1
			j -= 1
		elsif (score == score_left + gap)
			ref = [data[i-1]] + ref
			#seq = [fill_with] + seq
			i -= 1
		elsif (score == score_up + gap)
			ref = [fill_with] + ref
			#seq = sequence[j-1] + seq
			j -= 1
		end
	end
	
	while (i > 0)
		ref = [data[i-1]] + ref
		#seq = fill_with + seq
		i -= 1		
	end
	
	while (j > 0)
		ref = [fill_with] + ref
		#seq = data[j-1] + seq
		j -= 1
	end
	#raise "Misaligned! " if seq.size != ref.size
	return ref 
end

#-------------------------------------------------------------------
def inflate_atoms_and_store (limit=12345678, suffix='.3c')
	path_unzipped= "/home/snepomny/structures/data/open/"
	set_w_to_file "/tmp/warn.log"
 	set_e_to_file "/tmp/errors.log"
	counter = 0
	arr = Dir["#{path_unzipped}*#{suffix}"]
	require 'set'
	set = arr.to_set
 	Dir["#{path_unzipped}*.ent.*"].each do |path| 
		
		counter += 1
		return if counter>limit
		puts counter.to_s if counter%1001 == 1
	if ! path.include? suffix
	begin
		xyz = (get_xyz_residues  path)
		fa_name = path.split("/")[-1..-1].join("").gsub(".ent.","_").gsub("pdb","")
		seq = get_fasta_by_name(fa_name).split("")
			if ! set.include? path + suffix	
				crds = needle(seq,xyz[0],xyz[1],nil)
				print_crds_to_file(crds,path + suffix) 
			end #if
		rescue Exception => e
			puts path + "\n" + e
		end #catch
		end #if 
	end #bigger if
	
	
end

#-------------------------------------------------------------------
def convert_verb_xyz_to_simple_xyz (filename)
	l = %x[cat #{filename}]
	f = File.new(filename + ".3", "w")
	l.each do |line|
		#44.058,-15.325,17.941: C
		f.puts line.split(":")[0].split(",").join(" ")
	end
	f.close
end
#-------------------------------------------------------------------
def get_verb_xyz_residues (filename)
	
	if Dir[filename].empty?
		pdb = filename.split(".")[0..-2].join(".")
		puts "trying to get_verb in #{pdb}"
		get_xyz_for_all_chains pdb
	end
	raise "no file #{filename}"  if Dir[filename].empty?
	
	l = %x[cat #{filename}]
	arr = []
	l.each do |line|
		arr << line.split(":")[1].chomp.strip
	end
	return arr
end
#-------------------------------------------------------------------
def get_xyz_residues (filename)
	raise "no file #{filename}"  if Dir[filename].empty?
	
	l = %x[cat #{filename}]
	arr1 = []
	arr2 = []
	l.each do |line|
		t = line.split(":")
		arr1 << t[1].chomp.strip
		arr2 << t[0].split(",")
	end
	return [arr1,arr2]
end

#-------------------------------------------------------------------
#todo: validate this thing using data from remark 465 and offsets and don't know what
def preproc_xyz	
	path_unzipped= "/home/snepomny/structures/data/open/"
	set_w_to_file "/tmp/warn.log"
 	set_e_to_file "/tmp/errors.log"
	counter = 0
	arr = Dir["#{path_unzipped}*.ent.A"]
	require 'set'
	set = arr.to_set
 	Dir["#{path_unzipped}*.ent"].each do |path| 
		begin 
			#puts "hit"  if  set.include?(path+".A")
			get_xyz_for_all_chains path  if ! set.include?(path+".A")
			#path_unzipped + pair[0] # get_xyz_for_chain(path_unzipped + pair[0],pair[1]) 
			counter = counter+1
			puts counter.to_s  if counter % 1000 == 1
		rescue
			err_log "exception  in #{path}!" 
		end
	end
end